'''
Created on Sep 24, 2011

@author: Nam Khanh Tran
'''

import sys
import os
from html2text import *

# Output directory. _html2text() opens its result file at DIR + <derived name>
# (plain string concatenation), so this value must keep its trailing slash.
DIR = "/home/hp/workspace/cleanWebData/"

def _decode_html(raw):
    """Decode the raw bytes of a page into text.

    The character encoding is guessed with ``chardet`` when that package is
    installed; otherwise, or when no guess is available, UTF-8 is assumed.
    Decoding is strict, so undecodable input still raises UnicodeDecodeError.
    """
    try:
        from chardet import detect
    except ImportError:
        # chardet is optional: without it every page is treated as UTF-8.
        detect = lambda x: {'encoding': 'utf-8'}
    # The detector may report None (e.g. for empty input); decode(None)
    # would raise TypeError, so fall back to UTF-8 in that case.
    encoding = detect(raw)['encoding'] or 'utf-8'
    return raw.decode(encoding)


def _result_name(filename):
    """Build the output file name from the query-like parts of *filename*.

    The value following '&id=' (up to the next '&', if any) is the base name.
    A '.edoc', '.doc' or '.pat' suffix is appended depending on which of
    'edoc', 'doc' or 'index' occurs in *filename*, followed by everything
    after 'num=' and after 'pat=' when those markers are present.  Without an
    '&id=' marker the fixed name 'verb.index' is returned.
    """
    id_pos = filename.find('&id=')
    if id_pos == -1:
        return 'verb.index'

    start = id_pos + 4
    end = filename.find('&', start)
    fname = filename[start:end] if end != -1 else filename[start:]

    # NOTE(review): these substring tests run against the whole path that was
    # passed in, so a directory name containing 'doc' or 'index' also matches.
    # Left unchanged because callers may depend on it -- confirm.
    if 'edoc' in filename:
        fname += '.edoc'
    elif 'doc' in filename:
        fname += '.doc'
    elif 'index' in filename:
        fname += '.pat'

    for marker in ('num=', 'pat='):
        pos = filename.find(marker)
        if pos != -1:
            fname += '.' + filename[pos + 4:]
    return fname


def _write_examples(fout, text, wide):
    """Write each group of four non-empty lines of *text* as one row.

    *text* is a UTF-8 encoded byte string.  '*' characters are removed, then
    every four consecutive non-empty lines become one tab-separated row.  The
    fourth line is additionally split into a leading code and the remainder;
    the code length is counted in bytes: 4 (*wide*) or 2 when the line starts
    with two digits, otherwise 3 (*wide*) or 1.  A trailing group of fewer
    than four lines is not written.
    """
    group = []
    for line in text.replace(b'*', b'').split(b'\n'):
        if line:
            group.append(line)
        if len(group) == 4:
            last_field = group.pop()
            if last_field[:2].isdigit():
                code_len = 4 if wide else 2
            else:
                code_len = 3 if wide else 1
            row = group + [last_field[:code_len], last_field[code_len:]]
            fout.write(b'\t'.join(row) + b'\n')
            group = []


def _html2text(filename):
    """Convert the HTML file *filename* to text and store it under DIR.

    The file name is printed, the page is decoded (see _decode_html), turned
    into text with html2text() and encoded as UTF-8.  The output file name is
    derived from *filename* (see _result_name).  When 'index' occurs in
    *filename* the converted text is written unchanged; otherwise it is
    written as tab-separated rows (see _write_examples).

    Raises whatever open(), decode() or html2text() raise; nothing is
    swallowed here.
    """
    print(filename)

    # Close the input handle deterministically instead of leaking it.
    with open(filename, 'rb') as fin:
        data = fin.read()

    text = html2text(_decode_html(data)).encode('utf-8')
    fname = _result_name(filename)

    # Binary mode: *text* is already UTF-8 encoded, so the bytes are written
    # verbatim (identical to the previous 'w' mode on POSIX under Python 2).
    with open(DIR + fname, 'wb') as fout:
        if 'index' in filename:
            fout.write(text)
            return
        _write_examples(fout, text, 'edoc' in fname or '.e' in fname)
        
# Script entry point: convert every file found directly inside the directory
# given as the first command-line argument.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Parenthesised single-argument form prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print('\nUsage: python my_html2text.py [directory]')
        sys.exit(1)

    source_dir = sys.argv[1]
    for filename in os.listdir(source_dir):
        # Stylesheets are not pages; skip any entry whose name contains '.css'.
        if '.css' in filename:
            continue
        path = os.path.join(source_dir, filename)
        # os.listdir() also returns subdirectories; opening one for reading
        # inside _html2text() would raise, so only regular files are converted.
        if not os.path.isfile(path):
            continue
        _html2text(path)